{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "23cc9dfe-9feb-4921-8fe6-a2609f226d79",
   "metadata": {},
   "source": [
    "# Download Bo20 and Bo767"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8ae04a2-f257-4258-90b8-cb0eff0c49c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "from math import floor\n",
    "\n",
    "import pickle\n",
    "\n",
    "import requests\n",
    "import zipfile\n",
    "import io"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e045f506-f044-47df-bd6a-9d3e4f4f9e27",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_zip_range(zip_id):\n",
    "    \"\"\"Return the `LLLL-UUUU/` folder segment for the block of 1000 containing `zip_id`.\n",
    "\n",
    "    `zip_id` is anything `int()` accepts, e.g. \"1016\" gives \"1000-1999/\".\n",
    "    \"\"\"\n",
    "    block_start = floor(int(zip_id) / 1000) * 1000\n",
    "    bounds = (block_start, block_start + 999)\n",
    "    return \"-\".join(str(bound).zfill(4) for bound in bounds) + \"/\"\n",
    "\n",
    "def download_pdfs(download_path, pdf_ids):\n",
    "    \"\"\"Fetch any missing PDFs from the Digital Corpora CC-MAIN-2021-31 zipfiles.\n",
    "\n",
    "    Ids whose `<download_path>/<pdf_id>.pdf` already exists are skipped. The\n",
    "    rest are grouped by their first four characters, which name the zipfile\n",
    "    to fetch; each zipfile is downloaded once, the wanted PDFs are moved into\n",
    "    `download_path`, and everything else that was extracted is deleted.\n",
    "\n",
    "    Args:\n",
    "        download_path: Directory that receives the PDFs. A `temp`\n",
    "            subdirectory is created inside it while a zipfile is unpacked.\n",
    "        pdf_ids: Iterable of PDF id strings (file names without `.pdf`).\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If a zipfile request returns an error status.\n",
    "        zipfile.BadZipFile: If the response body is not a valid zipfile.\n",
    "        OSError: If a requested PDF is not present among the extracted files.\n",
    "    \"\"\"\n",
    "    dc_base_url = \"https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/\"\n",
    "    temp_dir = os.path.join(download_path, \"temp\")\n",
    "\n",
    "    # Group the missing ids by zipfile. Duplicates are dropped so the same\n",
    "    # file is never moved out of the temp folder twice.\n",
    "    to_download = {}\n",
    "    for pdf_id in pdf_ids:\n",
    "        pdf_path = os.path.join(download_path, pdf_id + \".pdf\")\n",
    "        if os.path.exists(pdf_path):\n",
    "            continue\n",
    "        sub_ids = to_download.setdefault(pdf_id[:4], [])\n",
    "        if pdf_id not in sub_ids:\n",
    "            sub_ids.append(pdf_id)\n",
    "\n",
    "    for zip_id, sub_ids in to_download.items():\n",
    "        print(f\"Downloading: {zip_id}.zip. Note: this is a large zipfile so it may take a while\")\n",
    "        # URLs always use forward slashes, so concatenate instead of using\n",
    "        # os.path.join (which inserts backslashes on Windows).\n",
    "        full_url = dc_base_url + get_zip_range(zip_id) + zip_id + \".zip\"\n",
    "\n",
    "        r = requests.get(full_url)\n",
    "        # Surface HTTP failures directly rather than as a confusing BadZipFile.\n",
    "        r.raise_for_status()\n",
    "\n",
    "        os.makedirs(temp_dir, exist_ok=True)\n",
    "        try:\n",
    "            with zipfile.ZipFile(io.BytesIO(r.content)) as z:\n",
    "                z.extractall(temp_dir)\n",
    "\n",
    "            # Move desired files to download folder\n",
    "            for pdf_id in sub_ids:\n",
    "                os.rename(os.path.join(temp_dir, pdf_id + \".pdf\"), os.path.join(download_path, pdf_id + \".pdf\"))\n",
    "        finally:\n",
    "            # Delete excess pdfs, even when extraction or a move failed\n",
    "            shutil.rmtree(temp_dir, ignore_errors=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0763ed14-a4bb-443f-8b6d-e07547074195",
   "metadata": {},
   "source": [
    "## Bo20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e6e9077-301e-41cf-af0e-c853f8cb47d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Destination folder and the 20 PDF ids that make up the bo20 set.\n",
    "bo20_path = \"../data/bo20/\"\n",
    "\n",
    "bo20_ids = [\n",
    "    \"1016445\", \"1177640\", \"1479052\", \"1690009\", \"2132230\",\n",
    "    \"1037700\", \"1238975\", \"1598224\", \"2049749\", \"2151932\",\n",
    "    \"1043219\", \"1375277\", \"1620834\", \"2062555\", \"2189929\",\n",
    "    \"1061225\", \"1422550\", \"1666072\", \"2127440\", \"2399488\",\n",
    "]\n",
    "\n",
    "# Only ids whose PDF is not already in bo20_path are fetched.\n",
    "download_pdfs(bo20_path, bo20_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6639aae6-de2f-4a22-bfa2-826750c92851",
   "metadata": {},
   "source": [
    "## Bo767"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "986f108f-eef3-4815-acb9-ba74cf781081",
   "metadata": {},
   "outputs": [],
   "source": [
    "bo767_path = \"../data/bo767/\"\n",
    "\n",
    "# One PDF id per line. Strip whitespace instead of slicing off the last\n",
    "# character, so a final line without a trailing newline keeps its last\n",
    "# digit, and skip blank lines so no empty id is requested.\n",
    "with open('bo767_ids.txt', 'r') as file:\n",
    "    bo767_ids = [line.strip() for line in file if line.strip()]\n",
    "\n",
    "download_pdfs(bo767_path, bo767_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f99e3c07-7c39-4944-adbb-28f1f6d39f16",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
